In [18]:
import os
import urllib.request
import zipfile
from collections import Counter
from random import randint

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
In [28]:
# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
EXPECTED_BYTES = 31344016
DATA_FOLDER = './data/'
FILE_NAME = 'text8.zip'
In [29]:
def download(file_name, expected_bytes):
    """ Download the dataset text8 if it's not already downloaded """
    file_path = DATA_FOLDER + file_name
    if os.path.exists(file_path):
        print("Dataset ready")
        return file_path
    os.makedirs(DATA_FOLDER, exist_ok=True)  # make sure the data folder exists
    file_name, _ = urllib.request.urlretrieve(DOWNLOAD_URL + file_name, file_path)
    file_stat = os.stat(file_path)
    if file_stat.st_size == expected_bytes:
        print('Successfully downloaded the file', file_name)
    else:
        raise Exception('File ' + file_name +
                        ' might be corrupted. You should try downloading it with a browser.')
    return file_path
In [30]:
def read_data(file_path):
    """ Read data into a list of tokens.
    There should be 17,005,207 tokens.
    """
    with zipfile.ZipFile(file_path) as f:
        # tf.compat.as_str() converts the input to a string
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return words
In [20]:
# corpus = 'the quick brown fox jumped over the lazy dog'
corpus = read_data(DATA_FOLDER + FILE_NAME)
corpus[:10]
Out[20]:
In [21]:
def build_vocab(words, vocab_size):
    """ Build a vocabulary of the vocab_size most frequent words """
    dictionary = dict()
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    os.makedirs('vocabulary', exist_ok=True)  # folder for the TensorBoard metadata file
    with open('vocabulary/vocab_1000.tsv', "w") as f:
        for word, _ in count:
            dictionary[word] = index
            if index < 1000:
                f.write(word + "\n")
            index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary
In [22]:
vocabulary, reverse_vocabulary = build_vocab(corpus, 5000)
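A quick sanity check on the mapping (a minimal sketch: 'UNK' always gets index 0, and since 'the' should be by far the most frequent token in text8 it should map to index 1):
In [ ]:
reverse_vocabulary[0]    # -> 'UNK'
vocabulary.get('the')    # -> 1, assuming 'the' is the most frequent word in the corpus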
In [23]:
len(vocabulary)
Out[23]:
In [6]:
def index_words_in_corpus(corpus):
    """ Replace each token with its vocabulary index, or 0 ('UNK') if it is out of vocabulary """
    return [vocabulary[token] if token in vocabulary else 0 for token in corpus]
In [24]:
corpus = index_words_in_corpus(corpus)
In [25]:
def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = randint(1, context_window_size)
        # targets within a random-sized window before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # targets within a random-sized window after the center word
        for target in index_words[index + 1: index + context + 1]:
            yield center, target
In [26]:
# [_ for _ in generate_sample(corpus, 2)]
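To see what generate_sample yields, here is a minimal sketch on a toy index list; with context_window_size=1 the window is always exactly one word, so the output is deterministic:
In [ ]:
list(generate_sample([0, 1, 2, 3], 1))
# [(0, 1), (1, 0), (1, 2), (2, 1), (2, 3), (3, 2)]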
In [27]:
def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as Numpy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch
In [11]:
# [_ for _ in get_batch(generate_sample(corpus, 1), 2)]
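Correspondingly, get_batch packs those pairs into fixed-size arrays. A minimal sketch of the first batch for the toy list above:
In [ ]:
centers, targets = next(get_batch(generate_sample([0, 1, 2, 3], 1), 2))
# centers: shape (2,),   array([0, 1], dtype=int32)
# targets: shape (2, 1), array([[1], [0]], dtype=int32)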
In [12]:
VOCAB_SIZE = 5000
BATCH_SIZE = 32
EMBED_SIZE = 10 # dimension of the word embedding vectors
SKIP_WINDOW = 3 # the context window
NUM_SAMPLED = 16 # Number of negative examples to sample.
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 10000
SKIP_STEP = 100 # how many steps to skip before reporting the loss
In [13]:
class SkipGramModel:
    """ Build the graph for the word2vec skip-gram model """
    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def _create_placeholders(self):
        """ Step 1: define the placeholders for input and output """
        with tf.name_scope("data"):
            self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name='center_words')
            self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name='target_words')

    def _create_embedding(self):
        """ Step 2: define weights. In word2vec, it's actually the weights that we care about """
        with tf.name_scope("embed"):
            self.embed_matrix = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size],
                                                              -1.0, 1.0),
                                            name='embed_matrix')

    def _create_loss(self):
        """ Step 3 + 4: define the model + the loss function """
        with tf.name_scope("loss"):
            # Step 3: define the inference
            embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name='embed')
            # Step 4: define the loss function
            # construct variables for NCE loss
            nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size],
                                                         stddev=1.0 / (self.embed_size ** 0.5)),
                                     name='nce_weight')
            nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name='nce_bias')
            # define the loss to be the NCE loss
            self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                                      biases=nce_bias,
                                                      labels=self.target_words,
                                                      inputs=embed,
                                                      num_sampled=self.num_sampled,
                                                      num_classes=self.vocab_size), name='loss')

    def _create_optimizer(self):
        """ Step 5: define the optimizer """
        self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss,
                                                                             global_step=self.global_step)

    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram_loss", self.loss)
            # because we have several summaries, merge them all
            # into one op to make them easier to manage
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """ Build the graph for our model """
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()
def train_model(model, batch_gen, num_train_steps):
    saver = tf.train.Saver()  # defaults to saving all variables - in this case embed_matrix, nce_weight, nce_bias
    initial_step = 0
    os.makedirs('./tf_checkpoints', exist_ok=True)
    os.makedirs('./processed', exist_ok=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('./tf_checkpoints/checkpoint'))
        # if a checkpoint exists, restore from it
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        total_loss = 0.0  # used to calculate the average loss over the last SKIP_STEP steps
        writer = tf.summary.FileWriter('./tf_graphs/lr' + str(LEARNING_RATE), sess.graph)
        initial_step = model.global_step.eval()
        for index in range(initial_step, initial_step + num_train_steps):
            centers, targets = next(batch_gen)
            feed_dict = {model.center_words: centers, model.target_words: targets}
            loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op],
                                              feed_dict=feed_dict)
            writer.add_summary(summary, global_step=index)
            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
                saver.save(sess, './tf_checkpoints/skip-gram', index)

        ####################
        # code to visualize the embeddings in TensorBoard
        # run "tensorboard --logdir=processed" to see the embeddings
        final_embed_matrix = sess.run(model.embed_matrix)

        # it has to be a variable; constants don't work here, and model.embed_matrix can't be reused
        embedding_var = tf.Variable(final_embed_matrix[:1000], name='embedding')
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter('./processed')

        # add the embedding to the config file
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name

        # link this tensor to its metadata file, in this case the first 1000 words of the vocab
        embedding.metadata_path = './vocabulary/vocab_1000.tsv'

        # saves a configuration file that TensorBoard will read during startup
        projector.visualize_embeddings(summary_writer, config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, './processed/model3.ckpt', 1)
In [14]:
model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
model.build_graph()
batch_generator = get_batch(generate_sample(corpus, SKIP_WINDOW), BATCH_SIZE)
In [15]:
train_model(model, batch_generator, NUM_TRAIN_STEPS)
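After training, one way to eyeball the learned embeddings without TensorBoard is a cosine-similarity lookup against the embedding matrix. This is a minimal sketch, assuming the checkpoint written by train_model exists and that the query word (here 'king') made it into the 5,000-word vocabulary:
In [ ]:
with tf.Session() as sess:
    saver = tf.train.Saver([model.embed_matrix])
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('./tf_checkpoints/checkpoint'))
    saver.restore(sess, ckpt.model_checkpoint_path)
    embeddings = sess.run(model.embed_matrix)

# normalize rows, then rank all words by cosine similarity to the query word
normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
similarities = normed.dot(normed[vocabulary['king']])
print([reverse_vocabulary[i] for i in np.argsort(-similarities)[:8]])  # nearest neighbours of 'king'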
In [ ]: